Consider the data from the example used in Statistical Analysis and in Machine Learning and Predictive Analytics, and compare the results with the workshop.
# Packages used in this section
library(tidyverse) # dplyr, tidyr, purrr, forcats, ggplot2
library(rstan)
library(HDInterval) # hdi()
library(runjags)
myData <- swiss
x <- myData %>% select(-Fertility)
data_list <- list(
Ntotal = nrow(x),
Nx = ncol(x),
x = as.matrix(x),
y = myData$Fertility
)
Fertility using all available predictors without shrinkage
data {
int<lower=1> Ntotal;
int<lower=1> Nx;
vector[Ntotal] y;
matrix[Ntotal, Nx] x;
}
transformed data {
real expLambda;
real mean_y;
real sd_y;
real unifLo;
real unifHi;
expLambda = 1 / 30.0;
mean_y = mean(y);
sd_y = sd(y);
unifLo = sd_y / 1000;
unifHi = sd_y * 1000;
}
parameters {
real<lower=0> nu;
real beta0;
vector[Nx] xbeta;
real<lower=0> sigma;
}
transformed parameters{
vector[Ntotal] y_hat;
y_hat = beta0 + x * xbeta;
}
model {
nu ~ exponential(expLambda);
sigma ~ uniform(unifLo, unifHi);
y ~ student_t(nu, y_hat, sigma);
}
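The sampling call below assumes the Stan program above has already been compiled into model_robust_all_preds; a minimal sketch of that step follows (the string name stan_code_robust_all_preds is hypothetical), with the two shrinkage models further below assumed to be compiled the same way into model_robust_all_preds_shrinkage and model_robust_all_preds_shrinkage_more.
# Compile the Stan program; model_code takes the program as a string
# (stan_code_robust_all_preds is an assumed name, not defined in this section)
model_robust_all_preds <- rstan::stan_model(model_code = stan_code_robust_all_preds)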
fit_robust_all_preds <- sampling(
model_robust_all_preds,
data = data_list,
pars = c(
"beta0",
"xbeta",
"sigma",
"nu"
),
iter = 2500,
chains = 4,
cores = 4,
control = list(
max_treedepth = 15
)
)
The estimated \(\nu\) is high, so the fit is probably not meaningfully robust: with such large degrees of freedom the Student-t likelihood is close to a normal.
plot(fit_robust_all_preds, pars = "nu")
The most notable correlations:
\(\beta_0\) and \(\beta_5\) have a strong negative relationship
\(\beta_0\) and \(\beta_1\) also have a noticeably strong negative relationship
There are several other pairs with correlations away from zero, but none to be alarmed about (no narrow tunnel shapes in the pairs plot); see also the numeric check after the pairs plot below.
pairs(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))
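As a numeric complement to the pairs plot, the posterior correlations can be computed directly (a quick sketch using the fit above):
# Posterior correlations between the intercept and slopes; large absolute
# values match the strong relationships visible in the pairs plot
round(cor(as.matrix(fit_robust_all_preds, pars = c("beta0", "xbeta"))), 2)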
Traces look good, and there are no autocorrelation issues:
stan_trace(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))
stan_ac(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))
We have smooth densities with no shrinkage:
stan_dens(fit_robust_all_preds, pars = c("beta0", "xbeta","sigma", "nu"))
\(\beta_1\), \(\beta_3\), \(\beta_4\), and \(\beta_5\) are significant. Note that \(\beta_1\) and \(\beta_4\) are close to zero but still significant based on the 95% HDI.
plot(fit_robust_all_preds, pars = "xbeta")
betas <- rstan::extract(fit_robust_all_preds, pars = c("beta0", "xbeta"))
hdi(betas$beta0)
## lower upper
## 46.06266 88.22377
## attr(,"credMass")
## [1] 0.95
hdi(betas$xbeta)
##
## [,1] [,2] [,3] [,4] [,5]
## lower -0.31913810 -0.7861737 -1.2391002 0.03280222 0.3214482
## upper -0.03401911 0.2335651 -0.4687047 0.17688555 1.8540699
## attr(,"credMass")
## [1] 0.95
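A programmatic version of the same check (a sketch; it flags slopes whose 95% HDI excludes zero, using the betas extracted above):
# TRUE where the 95% HDI of a slope excludes zero
apply(betas$xbeta, 2, function(b) {
  ci <- hdi(b)
  ci["lower"] > 0 | ci["upper"] < 0
})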
\(\nu\) is high but its HDI is very wide (and includes values below 10), so it is difficult to conclude that the Gaussian assumption is satisfied given the uncertainty in \(\nu\).
nu <- rstan::extract(fit_robust_all_preds, pars = "nu")
hdi(nu)
## $nu
## lower upper
## 2.097622 95.926288
## attr(,"credMass")
## [1] 0.95
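One way to quantify that uncertainty (a sketch; 10 is just a conventional cut-off below which the Student-t is noticeably heavier-tailed than the normal):
# Posterior probability that nu is small enough for heavy tails to matter
mean(nu$nu < 10)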
Fertility using all available predictors with shrinkage
data {
int<lower=1> Ntotal;
int<lower=1> Nx;
vector[Ntotal] y;
matrix[Ntotal, Nx] x;
}
transformed data {
real expLambda;
real meanY;
real sdY;
real unifLo;
real unifHi;
vector[Ntotal] zy; // normalized
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
expLambda = 1 / 30.0;
meanY = mean(y);
sdY = sd(y);
unifLo = sdY / 1000;
unifHi = sdY * 1000;
zy = (y - meanY) / sdY;
for (j in 1:Nx) {
meanX[j] = mean(x[, j]);
sdX[j] = sd(x[, j]);
for (i in 1:Ntotal) {
zx[i, j] = (x[i, j] - meanX[j]) / sdX[j];
}
}
}
parameters {
real zbeta0;
real<lower=0> sigmaBeta;
vector[Nx] zbeta;
real<lower=0> nu;
real<lower=0> zsigma;
}
transformed parameters{
vector[Ntotal] zy_hat;
zy_hat = zbeta0 + zx * zbeta;
}
model {
zbeta0 ~ normal(0, 2);
sigmaBeta ~ gamma(2.3, 1.3); // mode 1.0, sd ~1.2
// we want to allow sigmaBeta to be as small as necessary (ideally near zero,
// but free to deviate); the closer sigmaBeta is to zero, the stronger the shrinkage
zbeta ~ student_t(expLambda, 0, sigmaBeta);
nu ~ exponential(expLambda);
zsigma ~ uniform(unifLo, unifHi);
zy ~ student_t(1 + nu, zy_hat, zsigma);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
real sigma;
// .* and ./ are element-wise product and divide
beta0 = zbeta0 * sdY + meanY - sdY * sum(zbeta .* meanX ./ sdX);
beta = sdY * (zbeta ./ sdX);
sigma = zsigma * sdY;
}
fit_robust_all_preds_shrinkage <- sampling(
model_robust_all_preds_shrinkage,
data = data_list,
pars = c(
"beta0",
"beta",
"sigma",
"nu"
),
iter = 5000,
chains = 4,
cores = 4
)
Shrinkage is not strong:
plot(fit_robust_all_preds_shrinkage, pars = "beta")
stan_dens(fit_robust_all_preds_shrinkage, pars = c("beta0", "beta", "sigma", "nu"))
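A quick numeric comparison of the slope estimates from the two fits (a sketch; it only compares posterior means):
# Posterior means of the slopes: xbeta from the fit without shrinkage against
# the original-scale beta from the shrinkage fit
round(cbind(
  no_shrinkage = colMeans(as.matrix(fit_robust_all_preds, pars = "xbeta")),
  shrinkage = colMeans(as.matrix(fit_robust_all_preds_shrinkage, pars = "beta"))
), 3)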
If we reduce the parameters of the gamma prior, its mass moves closer to zero, which should pull \(\sigma_\beta\) (sigmaBeta) toward zero and hence strengthen the shrinkage.
data.frame(new_gamma = dgamma(seq(0, 1, .01), .8, 1.5),
original_gamma = dgamma(seq(0, 1, .01), 2.3, 1.3)) %>%
gather(key = "parameter", value = value) %>%
ggplot(aes(value, fill = parameter)) +
geom_density(alpha = .75, color = "#d9d9d9") +
scale_fill_viridis_d(name = NULL, labels = c(expression(dgamma(.8, 1.5)), expression(dgamma(2.3, 1.3)))) +
labs(title = latex2exp::TeX("New $\\gamma$ Encourages Movement to Zero"),
x = NULL, y = "Density") +
theme(legend.position = "top")
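For reference, the mode and standard deviation of a gamma(shape, rate) prior can be computed directly; this small helper (not part of the original workshop code) is how the comments on the sigmaBeta priors can be checked:
# mode = (shape - 1) / rate when shape > 1, otherwise 0; sd = sqrt(shape) / rate
gamma_mode_sd <- function(shape, rate) {
  c(mode = if (shape > 1) (shape - 1) / rate else 0,
    sd = sqrt(shape) / rate)
}
gamma_mode_sd(2.3, 1.3) # original prior: mode 1.0, sd ~1.2
gamma_mode_sd(0.8, 1.5) # new prior: mode 0, sd ~0.6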
Fertility using all available predictors with stronger shrinkage
data {
int<lower=1> Ntotal;
int<lower=1> Nx;
vector[Ntotal] y;
matrix[Ntotal, Nx] x;
}
transformed data {
real expLambda;
real meanY;
real sdY;
real unifLo;
real unifHi;
vector[Ntotal] zy; // normalized
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
expLambda = 1 / 30.0;
meanY = mean(y);
sdY = sd(y);
unifLo = sdY / 1000;
unifHi = sdY * 1000;
zy = (y - meanY) / sdY;
for (j in 1:Nx) {
meanX[j] = mean(x[, j]);
sdX[j] = sd(x[, j]);
for (i in 1:Ntotal) {
zx[i, j] = (x[i, j] - meanX[j]) / sdX[j];
}
}
}
parameters {
real zbeta0;
real<lower=0> sigmaBeta;
vector[Nx] zbeta;
real<lower=0> nu;
real<lower=0> zsigma;
}
transformed parameters{
vector[Ntotal] zy_hat;
zy_hat = zbeta0 + zx * zbeta;
}
model {
zbeta0 ~ normal(0, 2);
sigmaBeta ~ gamma(.8, 1.5); // mode 0, sd ~0.6
// we want to allow sigmaBeta to be as small as necessary (ideally near zero,
// but free to deviate); the closer sigmaBeta is to zero, the stronger the shrinkage
zbeta ~ student_t(expLambda, 0, sigmaBeta);
nu ~ exponential(expLambda);
zsigma ~ uniform(unifLo, unifHi);
zy ~ student_t(1 + nu, zy_hat, zsigma);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
real sigma;
// .* and ./ are element-wise product and divide
beta0 = zbeta0 * sdY + meanY - sdY * sum(zbeta .* meanX ./ sdX);
beta = sdY * (zbeta ./ sdX);
sigma = zsigma * sdY;
}
fit_robust_all_preds_shrinkage_more <- sampling(
model_robust_all_preds_shrinkage_more,
data = data_list,
pars = c(
"beta0",
"beta",
"sigma",
"nu"
),
iter = 5000,
chains = 4,
cores = 4,
warmup = 2500,
thin = 5,
control = list(
max_treedepth = 20,
adapt_delta = .93
)
)
\(\beta_1\) and \(\beta_2\) had the most shrinkage, and \(\beta_5\) became insignificant:
stan_dens(fit_robust_all_preds_shrinkage_more, pars = c("beta0", "beta","sigma", "nu"))
The HDIs of \(\beta_1\), \(\beta_2\), \(\beta_4\), and \(\beta_5\) now include zero; \(\beta_4\) and \(\beta_5\) only barely include it.
plot(fit_robust_all_preds_shrinkage_more, pars = "beta")
hdi(rstan::extract(fit_robust_all_preds_shrinkage_more, pars = "beta"))
## $beta
##
## [,1] [,2] [,3] [,4] [,5]
## lower -0.27202616 -0.7479470 -1.2104803 -0.0006538085 -0.009799926
## upper 0.02389413 0.1811659 -0.4045162 0.1621068403 1.815403081
## attr(,"credMass")
## [1] 0.95
hdi(rstan::extract(fit_robust_all_preds_shrinkage, pars = "beta"))
## $beta
##
## [,1] [,2] [,3] [,4] [,5]
## lower -0.27595542 -0.7734357 -1.2029135 0.01667599 0.2200677
## upper 0.01099398 0.1912987 -0.4261835 0.16629623 1.8840757
## attr(,"credMass")
## [1] 0.95
hdi(rstan::extract(fit_robust_all_preds, pars = "xbeta"))
## $xbeta
##
## [,1] [,2] [,3] [,4] [,5]
## lower -0.31913810 -0.7861737 -1.2391002 0.03280222 0.3214482
## upper -0.03401911 0.2335651 -0.4687047 0.17688555 1.8540699
## attr(,"credMass")
## [1] 0.95
Introducing shrinkage created autocorrelation problems, which were fixed by adjusting the sampler settings (longer warmup, thinning, higher adapt_delta, and a larger max_treedepth):
stan_ac(fit_robust_all_preds_shrinkage_more)
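Effective sample sizes and R-hat give a numeric view of the same diagnostic (a sketch; rstan's summary() exposes these columns):
# n_eff close to the number of saved draws and Rhat near 1 indicate the
# autocorrelation problem is resolved after the sampler adjustments
round(summary(fit_robust_all_preds_shrinkage_more)$summary[, c("n_eff", "Rhat")], 2)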
Variable selection with JAGS, using a Bernoulli inclusion indicator delta[j] for each predictor:
modelString = "
# Standardize the data:
data {
ym <- mean(y)
ysd <- sd(y)
for ( i in 1:Ntotal ) {
zy[i] <- ( y[i] - ym ) / ysd
}
for ( j in 1:Nx ) {
xm[j] <- mean(x[,j])
xsd[j] <- sd(x[,j])
for ( i in 1:Ntotal ) {
zx[i,j] <- ( x[i,j] - xm[j] ) / xsd[j]
}
}
}
# Specify the model for standardized data:
model {
for ( i in 1:Ntotal ) {
zy[i] ~ dt( zbeta0 + sum( delta[1:Nx] * zbeta[1:Nx] * zx[i,1:Nx] ) , 1/zsigma^2 , nu )
}
# Priors vague on standardized scale:
zbeta0 ~ dnorm( 0 , 1/2^2 )
for ( j in 1:Nx ) {
zbeta[j] ~ dt( 0 , 1/sigmaBeta^2 , 1 )
delta[j] ~ dbern( 0.5 )
}
zsigma ~ dunif( 1.0E-5 , 1.0E+1 )
## Uncomment one of the following specifications for sigmaBeta:
# sigmaBeta <- 2.0
# sigmaBeta ~ dunif( 1.0E-5 , 1.0E+2 )
sigmaBeta ~ dgamma(1.1051,0.1051) # mode 1.0, sd 10.0
# sigmaBeta <- 1/sqrt(tauBeta) ; tauBeta ~ dgamma(0.001,0.001)
nu ~ dexp(1/30.0)
# Transform to original scale:
beta[1:Nx] <- ( delta[1:Nx] * zbeta[1:Nx] / xsd[1:Nx] )*ysd
beta0 <- zbeta0*ysd + ym - sum( delta[1:Nx] * zbeta[1:Nx] * xm[1:Nx] / xsd[1:Nx] )*ysd
sigma <- zsigma*ysd
}
"
parameters <- c("beta0",
"beta",
"sigma",
"delta",
"sigmaBeta",
"zbeta0",
"zbeta",
"zsigma",
"nu")
adaptSteps <- 500
burnInSteps <- 1000
numSavedSteps <- 15000
thinSteps <- 25
nChains <- 3
runjagsMethod <- "parallel" # use "rjags" instead when working on a single-core CPU
runJagsOut <- run.jags(
method = runjagsMethod,
model = modelString,
monitor = parameters,
data = data_list,
n.chains = nChains,
adapt = adaptSteps,
burnin = burnInSteps,
sample = ceiling(numSavedSteps / nChains),
thin = thinSteps,
summarise = FALSE,
plots = FALSE
)
Diagnostics:
# plot all params
plot(runJagsOut,
plot.type = c("trace", "ecdf", "histogram", "autocorr")
)
## Generating summary statistics and plots (these will NOT be saved
## for reuse)...
## Calculating summary statistics...
## Calculating the Gelman-Rubin statistic for 21 variables....
# columns 8:12 of the monitored parameters are delta[1] through delta[5]
trajectoriesDelta <- as.matrix(runJagsOut$mcmc[, 8:12])
Nchain <- nrow(trajectoriesDelta) # total number of saved draws across chains
trajectoriesDelta %>% head()
## delta[1] delta[2] delta[3] delta[4] delta[5]
## [1,] 1 0 1 1 0
## [2,] 1 0 1 1 0
## [3,] 1 1 1 1 0
## [4,] 0 0 1 1 1
## [5,] 0 0 1 1 1
## [6,] 1 1 1 1 1
Inclusion Probabilities:
map_dbl(1:5, ~ sum(trajectoriesDelta[, .x] == 1) / Nchain) %>%
set_names(paste0("Delta ", dimnames(data_list$x)[[2]])) %>%
bind_rows() %>%
gather(key = "delta", value = value) %>%
ggplot(aes(fct_rev(fct_reorder(delta, value)), value)) +
geom_col(fill = "skyblue") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Variable Importance MCMC",
x = "Delta",
y = "MCMC Delta Proportion")
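The same inclusion probabilities can be read off directly as column means of the 0/1 trajectories (a quick numeric check of what the plot shows):
# Proportion of draws in which each predictor is included
round(colMeans(trajectoriesDelta), 3)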
Compare the relevance of the following sub-models based on their observed frequencies in the MCMC draws:
Fertility ~ Agriculture + Examination
Fertility ~ Education + Catholic + Infant.Mortality
Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
Fertility ~ .
Fertility ~ Agriculture + Education + Catholic + Infant.Mortality was the most relevant with the highest frequency proportion of ~32%.
delta_checks <- list("Agriculture + Examination" = c(1, 1, 0, 0, 0),
"Education + Catholic + Infant.Mortality" = c(0, 0, 1, 1, 1),
"Agriculture + Education + Catholic + Infant.Mortality" = c(1, 0, 1, 1, 1),
"All predictors" = c(1, 1, 1, 1, 1))
map(delta_checks, ~ sum(apply(trajectoriesDelta, 1, function(z) prod(z == .x)) / Nchain)) %>%
set_names(paste0("Delta ", delta_checks %>% names)) %>%
bind_rows() %>%
gather(key = "delta", value = value) %>%
ggplot(aes(fct_reorder(delta, value), value)) +
geom_col(fill = "skyblue") +
scale_y_continuous(labels = scales::percent) +
coord_flip() +
labs(title = "MCMC Sub-model comparison",
x = "Delta",
y = "MCMC Delta Frequency Proportion")
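To see every sub-model the chains visited rather than only the four candidates above, the inclusion patterns can be tabulated directly (a sketch; each pattern string lists delta[1] through delta[5]):
# Frequency of each visited inclusion pattern, most common first
sort(table(apply(trajectoriesDelta, 1, paste, collapse = "")), decreasing = TRUE) / Nchain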